In [1]:
# Necessary imports
import os
import time
from nbminer.notebook_miner import NotebookMiner
from nbminer.cells.cells import Cell
from nbminer.features.features import Features
from nbminer.stats.summary import Summary
from nbminer.stats.multiple_summary import MultipleSummary
from nbminer.encoders.ast_graph.ast_graph import *
# Root directory to scan; every sub-directory directly beneath it is searched
# for .ipynb files.
notebook_root = '../testbed/Final'

# os.listdir returns entries in arbitrary order. Sorting keeps the notebook
# list (and everything derived from it) identical from run to run.
people = sorted(os.listdir(notebook_root))
notebooks = []
for person in people:
    person = os.path.join(notebook_root, person)
    # Entries that are not directories are skipped.
    if os.path.isdir(person):
        direc = sorted(os.listdir(person))
        notebooks.extend(
            os.path.join(person, filename)
            for filename in direc
            if filename.endswith('.ipynb')
        )

# Wrap each discovered notebook path in a NotebookMiner.
notebook_objs = [NotebookMiner(path) for path in notebooks]
In [2]:
from nbminer.pipeline.pipeline import Pipeline
from nbminer.preprocess.get_ast_features import GetASTFeatures
from nbminer.preprocess.get_imports import GetImports
from nbminer.preprocess.resample_by_node import ResampleByNode
from nbminer.results.reconstruction_error.astor_error import AstorError
from nbminer.preprocess.feature_encoding import FeatureEncoding
from nbminer.encoders.cluster.kmeans_encoder import KmeansEncoder
# Run the full pipeline once, with KmeansEncoder left at its defaults, and
# print the summary produced by the AstorError stage.
a = Features(notebook_objs)

# Stage objects are created in the same order they run in the pipeline.
gastf, rbn, gi = GetASTFeatures(), ResampleByNode(), GetImports()
fe, ke, ae = FeatureEncoding(), KmeansEncoder(), AstorError()

stages = [gastf, rbn, gi, fe, ke, ae]
pipe = Pipeline(stages)
a = pipe.transform(a)

print(ae.get_summary())
In [3]:
# Sweep the number of k-means clusters. For every setting, record the metrics
# reported by the AstorError stage and the cluster labels from the encoder.
coverage_general, number_templates_general = [], []
avg_dist_general, avg_sim_general = [], []
labels = []

for value in (1200, 500, 100, 10, 1):
    print('Calculating for value: ', value)

    # Each setting gets a fresh Features object and fresh stage instances.
    a = Features(notebook_objs)
    gastf, rbn, gi, fe = GetASTFeatures(), ResampleByNode(), GetImports(), FeatureEncoding()
    ke = KmeansEncoder(n_clusters=value)
    ae = AstorError()

    stages = [gastf, rbn, gi, fe, ke, ae]
    pipe = Pipeline(stages)
    a = pipe.transform(a)

    # One entry per setting in every result list, so index i in each list
    # refers to the same run.
    avg_dist_general.append(ae.average_distance())
    avg_sim_general.append(ae.average_similarity())
    coverage_general.append(ae.get_percent_coverage())
    number_templates_general.append(ae.get_unique_templates())
    labels.append(ke.get_labels())
In [5]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (20, 20)

# x must list the same cluster counts, in the same order, as the sweep that
# filled the *_general lists; each list holds one value per entry of x.
x = [1200, 500, 100, 10, 1]
fig, axes = plt.subplots(2, 2)

# One entry per panel: (grid position, y-values, line label, caption).
# The caption is used both as the panel title and as its y-axis label.
panels = [
    ((0, 0), avg_dist_general,
     'Average Distance (All calls are the same)', 'Average edit distance'),
    ((0, 1), avg_sim_general,
     'Average Similarity (All calls are the same)', 'Average matching characters'),
    ((1, 0), coverage_general,
     'Coverage (All calls are the same)', 'Coverage of templates'),
    ((1, 1), number_templates_general,
     'Number of Templates (All calls are the same)', 'Number of templates'),
]

# Draw every panel with the same code path instead of four copied stanzas.
for position, series, line_label, caption in panels:
    ax = axes[position]
    ax.plot(x, series, label=line_label)
    ax.set_title(caption)
    ax.set_xlabel('Number of templates')
    ax.set_ylabel(caption)
Out[5]:
In [35]:
def get_vec_sizes(v):
    """Count how often each distinct element of ``v`` occurs.

    Returns a list holding one occurrence count per distinct element. The
    elements themselves are not returned, only their counts.
    """
    tally = {}
    for item in v:
        # Missing keys start from zero, so the first sighting yields 1.
        tally[item] = tally.get(item, 0) + 1
    return list(tally.values())
def num_one(v):
    """Return the number of elements of ``v`` that compare equal to 1."""
    return sum(1 for item in v if item == 1)
In [56]:
import numpy as np
plt.rcParams['figure.figsize'] = (10, 5)

# Tally the cluster sizes of every run once and reuse them for both curves.
sizes_per_run = [get_vec_sizes(v) for v in labels]

# The median curve leaves out the last run (x[-1] / labels[-1]); the
# single-element curve covers every run.
median_sizes = [np.median(sizes) for sizes in sizes_per_run[:-1]]
singleton_counts = [num_one(sizes) for sizes in sizes_per_run]

n1, = plt.plot(x[:-1], median_sizes, label='Median size of cluster')
n2, = plt.plot(x, singleton_counts, label='Number of clusters with one element')
plt.legend(handles=[n1, n2])
plt.xlabel('Number of clusters')
plt.ylabel('Number of examples')
Out[56]:
In [71]:
plt.rcParams['figure.figsize'] = (10, 30)

# With the default squeeze=True, plt.subplots(1) returns a bare Axes rather
# than an array, so axes[i] would fail for a single run. squeeze=False always
# returns a 2-D array, which is flattened to one Axes per run.
fig, axes = plt.subplots(len(labels), squeeze=False)
axes = axes.ravel()

# One histogram of cluster sizes per run; x[i] is the cluster count that
# produced labels[i].
for i, run_labels in enumerate(labels):
    ax = axes[i]
    ax.hist(get_vec_sizes(run_labels), bins=100)
    ax.set_xlabel('Number of data points')
    ax.set_ylabel('Number of clusters')
    # Use the singular form for the one-cluster run.
    title = "1 Cluster" if x[i] == 1 else str(x[i]) + " Clusters"
    ax.set_title(title)
In [ ]: